Regression

source("functions.R")

On commence par importer le jeu de données et on vérifie s’il y a des valeurs manquantes, ce qui n’est pas le cas. On peut donc continuer avec l’analyse des données en vérifiant le type des variables :

On va transformer Bonus_Malus en variable binaire et retirer les variables qui ne sont pas utiles pour la prédiction, comme PolID.

library(rmarkdown)
library(dplyr)

# Load the training and test sets (comma-separated, "." as decimal mark).
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
train <- read.csv("./data/train_set.csv", header = TRUE, sep = ",", dec = ".")
test <- read.csv("./data/test_set.csv", header = TRUE, sep = ",", dec = ".")



# Count the missing values in the training set.
sum(is.na(train))
## [1] 0
# Collapse Bonus_Malus into two classes: values below 100 become "Bonus",
# every other value becomes "Malus".
train[["Bonus_Malus"]] <- ifelse(train[["Bonus_Malus"]] >= 100, "Malus", "Bonus")
test[["Bonus_Malus"]] <- ifelse(test[["Bonus_Malus"]] >= 100, "Malus", "Bonus")

# Drop the PolID column from both data sets.
train <- select(train, -PolID)
test <- select(test, -PolID)

# Preview the training data as a paged table.
paged_table(train)

On peut maintenant continuer avec l’analyse des données en vérifiant le type des variables:

library(kableExtra)

# Classify the columns of `train` with classifier_variables_tab() and store
# the variable names in one-column data frames for display.
variables <- classifier_variables_tab(train)
numeric_variables <- data.frame(
    variables_numériques = variables$variables_numeriques
)
categorical_variables <- data.frame(
    variables_catégorielles = c(
        variables$variables_categorielles,
        variables$variables_binaires
    )
)

# Render the categorical (and binary) variable names as a styled table.
categorical_variables %>%
    kable() %>%
    kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = FALSE
    )
variables_catégorielles
Car_Model
Urban_rural_class
French_region
Bonus_Malus
Car_Fuel
# Render the numeric variable names as a styled table.
numeric_variables %>%
    kable() %>%
    kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = FALSE
    )
variables_numériques
Claim
Period_Exp
Car_Power
Car_Age
Age
Inhab_density
# Convert the categorical (and binary) variables to factors in both data
# sets, then display the resulting structure of `train`.
variables <- classifier_variables_tab(train)
numeric_variables <- variables$variables_numeriques
categorical_variables <- c(
    variables$variables_categorielles,
    variables$variables_binaires
)

# Each selected column is passed through factor() on its own.
train[categorical_variables] <- Map(factor, train[categorical_variables])
test[categorical_variables] <- Map(factor, test[categorical_variables])

str(train)
## 'data.frame':    542389 obs. of  11 variables:
##  $ Claim            : int  4 5 8 4 11 4 0 0 0 0 ...
##  $ Period_Exp       : num  0.56 1 0.41 0.27 0.08 0.1 0.96 0.73 0.09 0.73 ...
##  $ Car_Power        : int  4 7 4 5 4 4 14 10 4 5 ...
##  $ Car_Age          : int  4 9 12 9 13 1 25 2 12 4 ...
##  $ Age              : int  46 67 52 23 53 31 49 38 27 32 ...
##  $ Bonus_Malus      : Factor w/ 2 levels "Bonus","Malus": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Car_Model        : Factor w/ 11 levels "B1","B10","B11",..: 9 7 1 8 1 4 2 4 10 8 ...
##  $ Car_Fuel         : Factor w/ 2 levels "Diesel","Regular": 1 1 2 1 2 2 2 1 2 1 ...
##  $ Urban_rural_class: Factor w/ 6 levels "A","B","C","D",..: 1 5 4 5 4 5 5 3 3 3 ...
##  $ Inhab_density    : int  29 4762 824 6924 824 2983 5053 160 229 461 ...
##  $ French_region    : Factor w/ 22 levels "Alsace","Aquitaine",..: 7 21 13 12 13 17 12 20 6 6 ...

## Étude des variables catégorielles

0.1 Car Model

# Car_Model: run the three categorical plotting helpers on the training set.
# The plot_* helpers are not defined in this file (presumably they come from
# functions.R); what each one draws should be confirmed there.
plot_categorical(train, "Car_Model")

plot_percentage(train, "Car_Model")

plot_hist_by_claim(train, "Car_Model")

0.2 Bonus_Malus

# Bonus_Malus: run the three categorical plotting helpers on the training set.
# The plot_* helpers are not defined in this file (presumably they come from
# functions.R); what each one draws should be confirmed there.
plot_categorical(train, "Bonus_Malus")

plot_percentage(train, "Bonus_Malus")

plot_hist_by_claim(train, "Bonus_Malus")

0.3 Urban_rural_class

# Urban_rural_class: run the three categorical plotting helpers on the
# training set. The plot_* helpers are not defined in this file (presumably
# they come from functions.R); what each one draws should be confirmed there.
plot_categorical(train, "Urban_rural_class")

plot_percentage(train, "Urban_rural_class")

plot_hist_by_claim(train, "Urban_rural_class")

0.4 Car_Fuel

# Car_Fuel: run the three categorical plotting helpers on the training set.
# The plot_* helpers are not defined in this file (presumably they come from
# functions.R); what each one draws should be confirmed there.
plot_categorical(train, "Car_Fuel")

plot_percentage(train, "Car_Fuel")

plot_hist_by_claim(train, "Car_Fuel")

# Mosaic plot of Car_Fuel crossed with Bonus_Malus, with shading enabled.
library(vcd)
mosaic(~Car_Fuel + Bonus_Malus, data = train, shade = TRUE)

# Disabled alternative view of the same two-way table:
# assocplot(table(train$Car_Fuel, train$Bonus_Malus))

0.5 French_region

# NOTE(review): functions.R is already sourced at the top of the file; this
# re-executes it, presumably to pick up edits made during development.
source("functions.R")
# French_region: run the three categorical plotting helpers on the training
# set (helpers are not defined in this file; presumably from functions.R).
plot_categorical(train, "French_region")

plot_percentage(train, "French_region")

plot_hist_by_claim(train, "French_region")

# Region-level view of the claims. The second argument is the path to a
# GeoJSON file; judging by its name it holds the pre-2015 French region
# boundaries (TODO confirm how the helper uses it).
plot_claims_by_region(train, "./data/regions-avant-redecoupage-2015.geojson")

1 Étude des variables numériques

1.1 Inhab_density

# Draw and print three diagnostic plots for one numeric variable.
#
# Args:
#   data:     data frame containing the column named by `variable` and a
#             `Claim` column (used as the y axis of the scatter plot).
#   variable: name of the numeric column to plot, given as a string.
#
# Printed plots, in order:
#   1. histogram (30 bins, density scale) with a density curve overlaid;
#   2. boxplot of the variable;
#   3. scatter plot of the variable against Claim.
#
# Called for its side effect of printing the plots.
plot_numeric <- function(data, variable) {
    # `.data[[variable]]` replaces the deprecated aes_string() and
    # after_stat(density) replaces the deprecated `..density..` notation
    # (after_stat() needs ggplot2 >= 3.3.0). The x label is set explicitly
    # so the axis still shows the column name.
    p1 <- ggplot(data, aes(x = .data[[variable]])) +
        geom_histogram(aes(y = after_stat(density)),
            bins = 30, fill = "lightblue", color = "black") +
        geom_density(alpha = 0.2, fill = "#FF6666") +
        labs(title = paste("Distribution de la variable", variable),
            x = variable) +
        theme_bw()

    p2 <- ggplot(data, aes(x = .data[[variable]])) +
        geom_boxplot(fill = "lightblue", color = "black") +
        labs(title = paste("Boxplot de la variable", variable),
            x = variable) +
        theme_bw()

    # Use the `data` argument here: the previous version read the global
    # `train`, so the scatter plot ignored whatever data frame was passed in.
    p3 <- ggplot(data, aes(x = .data[[variable]], y = Claim)) +
        geom_point(alpha = 0.6, color = "darkorange") +
        labs(title = paste("Relation entre", variable, "et nombre de sinistres"),
            x = variable, y = "Nombre de sinistres") +
        theme_minimal()

    print(p1)
    print(p2)
    print(p3)
}

# Plot the distribution of one column broken down by claim count.
#
# Args:
#   data: data frame containing a `Claim` column and the column named by
#         `col`.
#   col:  name of the column to plot, given as a string.
#   type: which plot to build. "histogram" (the default, and the plot this
#         function has always returned) gives a 20-bin histogram filled by
#         Claim; "boxplot" gives one box per Claim value.
#
# Returns:
#   A ggplot object; nothing is printed by the function itself.
#
# The boxplot used to be built and then thrown away, so it could never be
# displayed. It is now reachable through `type` while the default result is
# unchanged.
box_plot <- function(data, col, type = c("histogram", "boxplot")) {
    type <- match.arg(type)

    # Treat the claim count as a discrete grouping variable.
    data$Claim <- as.factor(data$Claim)

    if (type == "boxplot") {
        return(
            ggplot(data, aes(x = Claim, y = .data[[col]], fill = Claim)) +
                geom_boxplot() +
                labs(title = paste("Distribution de", col, " par Claim"),
                    x = "Claim", y = col) +
                theme_bw()
        )
    }

    # Histogram with 20 bins, one fill colour per Claim value.
    ggplot(data, aes(x = .data[[col]], fill = Claim)) +
        geom_histogram(color = "black", bins = 20, alpha = 1) +
        labs(title = paste("Histogramme de", col, "par Claim"),
            x = col, y = "Nombre") +
        theme_bw()
}


# Inhab_density: print the three diagnostic plots from plot_numeric(), then
# display the plot returned by box_plot().
plot_numeric(train, "Inhab_density")

box_plot(train, "Inhab_density")

# Number of training rows whose Inhab_density is below 1.
print(sum(train$Inhab_density < 1))
## [1] 0

1.2 Age

# Age: print the three diagnostic plots from plot_numeric().
plot_numeric(train, "Age")

# Number of training rows whose Age is above 80.
print(sum(train$Age > 80))
## [1] 4943

1.3 Car_Age

# Car_Age: print the three diagnostic plots from plot_numeric().
plot_numeric(train, "Car_Age")

1.4 Car_Power

# Car_Power: print the three diagnostic plots from plot_numeric().
plot_numeric(train, "Car_Power")

1.5 Period_Exp

# Period_Exp: print the three diagnostic plots from plot_numeric().
plot_numeric(train, "Period_Exp")

2 Analyse de la target

# NOTE(review): functions.R is already sourced at the top of the file; this
# re-executes it, presumably to pick up edits made during development.
source("functions.R")
# Percentage plot for the target variable Claim. The meaning of the third
# argument (5) is defined by plot_percentage(), which is not in this file —
# TODO confirm.
plot_percentage(train, "Claim", 5)

## Analyse des corrélations

Une heatmap pour visualiser les corrélations entre les variables numériques.

library(reshape2)
library(corrplot)

# Keep only the numeric columns (target included) for the correlation study.
num_vars <- train[c("Claim", "Period_Exp", "Car_Power", "Car_Age", "Age", "Inhab_density")]

# Pairwise correlation matrix (cor() defaults), then reshaped to long format
# (Var1, Var2, value) so ggplot can draw one tile per pair.
corr_matrix <- cor(num_vars)
melted_cor <- melt(corr_matrix)

# Heatmap with a diverging scale centred on 0: red for negative, white for
# zero, blue for positive correlations.
ggplot(melted_cor, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0) +
    labs(title = "Heatmap des corrélations", x = "", y = "")

# Same matrix drawn with corrplot, using circles.
corrplot(corr_matrix, method = "circle")